The first step in this analysis would be to import all libraries that are going to be used during these tasks:
library(tidyverse)
library(ggplot2)
library(plotly)
library(countrycode)
library(kableExtra)
Then reading in the gapminder_clean.csv data as a tibble using read_csv:
data <- read.csv("gapminder_clean.csv") %>%
as_tibble()
And renaming some columns to avoid the repetition of bloated names:
colnames(data)[colnames(data) == "CO2.emissions..metric.tons.per.capita."] <- "co2_emissions"
colnames(data)[colnames(data) == "Population.density..people.per.sq..km.of.land.area."] <- "population_density"
colnames(data)[colnames(data) == "Imports.of.goods.and.services....of.GDP."] <- "imports"
colnames(data)[colnames(data) == "Energy.use..kg.of.oil.equivalent.per.capita."] <- "energy_use"
colnames(data)[colnames(data) == "Life.expectancy.at.birth..total..years."] <- "life_expectancy"
First off, it’s requested that I plot a graph of countries’ CO2 emissions matched with their GDP per capita in 1962. To achieve that, we must filter the data so we get just the data on the relevant year:
data_on_62 <- data %>%
filter(Year == 1962)
With this data, we can already plot the respective dot plot:
ggplot(data_on_62, aes(x = co2_emissions, y = gdpPercap)) +
geom_point() +
labs(
x = "CO2 emissions (metric tons per capita)", y = "GDP per capita",
title = "GDP per capita variation according to CO2 emissions"
)
And also determine the correlation between these variables:
cor_test_res <- cor.test(data_on_62$co2_emissions, data_on_62$gdpPercap)
cor_test_res
##
## Pearson's product-moment correlation
##
## data: data_on_62$co2_emissions and data_on_62$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8934697 0.9489792
## sample estimates:
## cor
## 0.9260817
It is then requested that I calculate in which year the correlation between the CO2 emissions and GDP per capita is stronger. It can be done in the following manner:
cor_table <- data %>%
select(Country.Name, Year, gdpPercap, co2_emissions) %>%
drop_na() %>%
group_by(Year) %>%
summarize(cor = cor(co2_emissions, gdpPercap))
cor_table <- cbind(Rank = seq(1, 10), cor_table[order(-cor_table$cor), ])
cor_table %>%
kbl(caption = "Correlation between the CO2 emissions and GDP per capita in its respective years") %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Rank | Year | cor |
|---|---|---|
| 1 | 1967 | 0.9387918 |
| 2 | 1962 | 0.9260817 |
| 3 | 1972 | 0.8428986 |
| 4 | 1982 | 0.8166384 |
| 5 | 1987 | 0.8095531 |
| 6 | 1992 | 0.8094316 |
| 7 | 1997 | 0.8081396 |
| 8 | 2002 | 0.8006421 |
| 9 | 1977 | 0.7928336 |
| 10 | 2007 | 0.7204169 |
And finally, we will plot the same dot plot as before, but coloring the dots according to their continents and sizing them according to their population:
co2_gdp_scatterplot <- data_on_62 %>%
select(Country.Name, Year, co2_emissions, continent, pop, gdpPercap) %>%
drop_na() %>%
ggplot(aes(
x = co2_emissions,
y = gdpPercap,
color = continent,
size = pop
)) +
geom_point(alpha = 0.5) +
labs(
x = "CO2 emissions (metric tons per capita)", y = "GDP per capita",
title = "GDP per capita variation according to CO2 emissions",
) +
scale_color_discrete(name = "Continent") +
scale_size("", range = c(1, 10))
ggplotly(co2_gdp_scatterplot)
What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’?
As a relation with a categorical predictor variable and a single quantitative outcome to be evaluated in respect to multiple groups, the adequate statistical test to be implemented in this case is ANOVA. And aplying this method, we get the following results:
energy_continent_anova <- aov(energy_use ~ continent, data = data)
summary(energy_continent_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 5 8.124e+08 162482656 21.88 <2e-16 ***
## Residuals 1404 1.043e+10 7426183
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1197 observations deleted due to missingness
Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990?
In contrast, although this case also features a categorical predictor variable and single quantitative outcome, they are grouped according to just two continents. So, in this case we should analyse it with a t-test:
relevant_continents <- c("Europe", "Asia")
data_as_eu_after_90 <- data %>%
select(Country.Name, Year, imports, continent) %>%
filter(continent %in% relevant_continents, Year > 1990)
europe_asia_imports_t_test <- t.test(data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Europe"], data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Asia"])
europe_asia_imports_t_test
##
## Welch Two Sample t-test
##
## data: data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Europe"] and data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Asia"]
## t = -1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -12.433240 2.321099
## sample estimates:
## mean of x mean of y
## 41.78924 46.84531
What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years?
We can visualize the question in this problem by plotting a line graph showing the population density evolution across the years:
pop_density_plot <- ggplot(data, aes(x = Year, y = log10(population_density), group = Country.Name, color = Country.Name, label = Country.Name)) +
geom_line() +
labs(
y = "Population density (log10(people/square km of land area))", x = "Year",
title = "Population density variation according to year per country",
)
ggplotly(pop_density_plot)
To ascertain which country has the highest average ranking, we can compare them according to a score that consists of the sum of their position in a decreasing ranking of population density spanning all available years. It’s calculated in the following way:
years <- unique(data$Year)
countries <- unique(data$Country.Name[!is.na(data$Country.Name)])
pop_density_ranking <- rep(0, times = length(countries))
names(pop_density_ranking) <- countries
for (x in years) {
year_data <- data %>%
select(Country.Name, Year, population_density) %>%
na.omit() %>%
filter(Year == x)
year_data$population_density <- rank(year_data$population_density, na.last = TRUE)
for (z in year_data$Country.Name) {
pop_density_ranking[[z]] <- pop_density_ranking[[z]] + year_data$population_density[year_data$Country.Name == z]
}
}
So the countries with the highest scores are:
pop_density_ranking <- pop_density_ranking %>%
sort(decreasing = TRUE)
data.frame(Rank = seq(1, 10), Country = names(head(pop_density_ranking, 10)), Score = unname(head(pop_density_ranking, 10))) %>%
kbl(caption = "Countries with the most population density during the 1962-2007 period") %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Rank | Country | Score |
|---|---|---|
| 1 | Macao SAR, China | 2553 |
| 2 | Monaco | 2553 |
| 3 | Hong Kong SAR, China | 2537 |
| 4 | Singapore | 2529 |
| 5 | Gibraltar | 2518 |
| 6 | Bermuda | 2506 |
| 7 | Malta | 2498 |
| 8 | Bangladesh | 2476 |
| 9 | Channel Islands | 2474 |
| 10 | Maldives | 2461 |
Having already calculated the population density score, we can best visualize it by building a choropleth:
country_codes <- countrycode(names(pop_density_ranking), origin = "country.name", destination = "iso3c")
pop_dens_df <- data.frame(country = names(pop_density_ranking), rank = unname(pop_density_ranking), code = country_codes)
geo_config <- list(
scope = "world",
showocean = TRUE,
oceancolor = toRGB("LightBlue"),
showland = TRUE,
landcolor = toRGB("#e5ecf6")
)
density_choropleth <- plot_ly(pop_dens_df, type = "choropleth", locations = pop_dens_df$code, z = pop_dens_df$rank, text = pop_dens_df$country, colors = "Reds") %>%
layout(title = "Population density ranking dominance in the 1962-2007 period)", geo = geo_config) %>%
colorbar(title = "Arbitrary units")
density_choropleth
What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ since 1962?
To calculate this, we must get the difference between the life expectancy at the first data point and the last one for each country. We can do such a thing in the following manner:
life_expectancy_diff <- rep(0, times = length(countries))
names(life_expectancy_diff) <- countries
for (z in countries) {
country_life_exp <- data %>%
select(Country.Name, Year, life_expectancy) %>%
filter(Country.Name == z)
years <- unique(country_life_exp$Year)
life_expectancy_diff[z] <- country_life_exp$life_expectancy[country_life_exp$Year == tail(years, 1)] - country_life_exp$life_expectancy[country_life_exp$Year == head(years, 1)]
}
life_expectancy_diff <- life_expectancy_diff %>%
sort(decreasing = TRUE)
Then we can present the countries who had the greatest increases in their life expectancy in a bar plot:
life_expectancy_diff_df <- data.frame(names = factor(names(life_expectancy_diff), levels = names(life_expectancy_diff)), life_expectancy_diff)
top_life_exp_bar_plot <- life_expectancy_diff_df %>%
head(10) %>%
ggplot(aes(names, life_expectancy_diff, color = names, fill = names)) +
geom_col() +
theme(axis.text.x = element_text(angle = 90)) +
theme(legend.position = "none") +
labs(
y = "Life expectancy difference (in years)", x = "Country",
title = "Most profound gains in life expectancy during 1962-2007 period"
)
ggplotly(top_life_exp_bar_plot)
As we can also see the worst results in this period:
rev_life_expectancy_diff <- life_expectancy_diff %>%
sort(decreasing = FALSE)
rev_life_expectancy_diff_df <- life_expectancy_diff_df <- data.frame(names = factor(names(rev_life_expectancy_diff), levels = names(rev_life_expectancy_diff)), rev_life_expectancy_diff)
low_life_exp_bar_plot <- rev_life_expectancy_diff_df %>%
head(10) %>%
ggplot(aes(names, rev_life_expectancy_diff, color = names, fill = names)) +
geom_col() +
theme(axis.text.x = element_text(angle = 90)) +
theme(legend.position = "none") +
labs(
y = "Life expectancy difference (in years)", x = "Country",
title = "Worst changes in life expectancy during 1962-2007 period"
)
ggplotly(low_life_exp_bar_plot)
And at last, a choropleth plot would be useful for presenting the bigger picture:
le_country_codes <- countrycode(names(life_expectancy_diff), origin = "country.name", destination = "iso3c")
life_expectancy_df <- data.frame(country = names(life_expectancy_diff), years = unname(life_expectancy_diff), code = le_country_codes)
geo_config <- list(
scope = "world",
showocean = TRUE,
oceancolor = toRGB("LightBlue"),
showland = TRUE,
landcolor = toRGB("#e5ecf6")
)
le_choropleth <- plot_ly(life_expectancy_df, type = "choropleth", locations = life_expectancy_df$code, z = life_expectancy_df$years, text = life_expectancy_df$country, colors = "Reds") %>%
layout(title = "Changes in life expectancy in the 1962-2007 period)", geo = geo_config) %>%
colorbar(title = "Years")
le_choropleth